Import Library

# Data wrangling Library
library(tidyverse)
library(dplyr) 
# Visualize data
library(ggplot2)
library(inspectdf)
library(GGally)
# Naive Bayes 
library(e1071)
# Splitting Data
library(rsample)
# Random Forest
library(randomForest)
# Smote for unbalanced data
library(DMwR)
# ROCR
library(ROCR)
# Confussion Matrix
library(caret)
telemark <- read_csv2("data/bank-full.csv")
## Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.
## Parsed with column specification:
## cols(
##   age = col_double(),
##   job = col_character(),
##   marital = col_character(),
##   education = col_character(),
##   default = col_character(),
##   balance = col_double(),
##   housing = col_character(),
##   loan = col_character(),
##   contact = col_character(),
##   day = col_double(),
##   month = col_character(),
##   duration = col_double(),
##   campaign = col_double(),
##   pdays = col_double(),
##   previous = col_double(),
##   poutcome = col_character(),
##   y = col_character()
## )
glimpse(telemark)
## Observations: 45,211
## Variables: 17
## $ age       <dbl> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43, 41, 29, 53, 58, 57,…
## $ job       <chr> "management", "technician", "entrepreneur", "blue-collar", …
## $ marital   <chr> "married", "single", "married", "married", "single", "marri…
## $ education <chr> "tertiary", "secondary", "secondary", "unknown", "unknown",…
## $ default   <chr> "no", "no", "no", "no", "no", "no", "no", "yes", "no", "no"…
## $ balance   <dbl> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 390, 6, 7…
## $ housing   <chr> "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes", "yes…
## $ loan      <chr> "no", "no", "yes", "no", "no", "no", "yes", "no", "no", "no…
## $ contact   <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unk…
## $ day       <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,…
## $ month     <chr> "may", "may", "may", "may", "may", "may", "may", "may", "ma…
## $ duration  <dbl> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 137, 517…
## $ campaign  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ pdays     <dbl> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,…
## $ previous  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ poutcome  <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unk…
## $ y         <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no",…

Input variables:
1. age: age (numeric)
2. job : type of job (categorical: “admin.”,“unknown”,“unemployed”,“management”,“housemaid”,“entrepreneur”,“student”, “blue-collar”,“self-employed”,“retired”,“technician”,“services”)
3. marital : marital status (categorical: “married”,“divorced”,“single”; note: “divorced” means divorced or widowed)
4. education : education (categorical: “unknown”,“secondary”,“primary”,“tertiary”)
5. default: has credit in default? (binary: “yes”,“no”)
6. balance: average yearly balance, in euros (numeric)
7. housing: has housing loan? (binary: “yes”,“no”)
8. loan: has personal loan? (binary: “yes”,“no”)
9. contact: contact communication type (categorical: “unknown”,“telephone”,“cellular”)
10. day: last contact day of the month (numeric)
11. month: last contact month of year (categorical: “jan”, “feb”, “mar”, …, “nov”, “dec”)
12. duration: last contact duration, in seconds (numeric)
13. campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
14. pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
15. previous: number of contacts performed before this campaign and for this client (numeric)
16. poutcome: outcome of the previous marketing campaign (categorical: “unknown”,“other”,“failure”,“success”)
17. y: has the client subscribed a term deposit? (binary: “yes”,“no”)

table(is.na(telemark))
## 
##  FALSE 
## 768587
telemark <- telemark %>% 
  mutate(job = as.factor(job),
         marital = as.factor(marital),
         education = as.factor(education),
         default = as.factor(default),
         housing = as.factor(housing),
         loan = as.factor(loan),
         contact = as.factor(contact),
         month = as.factor(month),
         poutcome = as.factor(poutcome),
         subscribe = as.factor(y)) %>% 
  select(-c(y))

Exploratory Data Analysis

summary(telemark)
##       age                 job           marital          education    
##  Min.   :18.00   blue-collar:9732   divorced: 5207   primary  : 6851  
##  1st Qu.:33.00   management :9458   married :27214   secondary:23202  
##  Median :39.00   technician :7597   single  :12790   tertiary :13301  
##  Mean   :40.94   admin.     :5171                    unknown  : 1857  
##  3rd Qu.:48.00   services   :4154                                     
##  Max.   :95.00   retired    :2264                                     
##                  (Other)    :6835                                     
##  default        balance       housing      loan            contact     
##  no :44396   Min.   : -8019   no :20081   no :37967   cellular :29285  
##  yes:  815   1st Qu.:    72   yes:25130   yes: 7244   telephone: 2906  
##              Median :   448                           unknown  :13020  
##              Mean   :  1362                                            
##              3rd Qu.:  1428                                            
##              Max.   :102127                                            
##                                                                        
##       day            month          duration         campaign     
##  Min.   : 1.00   may    :13766   Min.   :   0.0   Min.   : 1.000  
##  1st Qu.: 8.00   jul    : 6895   1st Qu.: 103.0   1st Qu.: 1.000  
##  Median :16.00   aug    : 6247   Median : 180.0   Median : 2.000  
##  Mean   :15.81   jun    : 5341   Mean   : 258.2   Mean   : 2.764  
##  3rd Qu.:21.00   nov    : 3970   3rd Qu.: 319.0   3rd Qu.: 3.000  
##  Max.   :31.00   apr    : 2932   Max.   :4918.0   Max.   :63.000  
##                  (Other): 6060                                    
##      pdays          previous           poutcome     subscribe  
##  Min.   : -1.0   Min.   :  0.0000   failure: 4901   no :39922  
##  1st Qu.: -1.0   1st Qu.:  0.0000   other  : 1840   yes: 5289  
##  Median : -1.0   Median :  0.0000   success: 1511              
##  Mean   : 40.2   Mean   :  0.5803   unknown:36959              
##  3rd Qu.: -1.0   3rd Qu.:  0.0000                              
##  Max.   :871.0   Max.   :275.0000                              
## 
show_plot(inspect_cor(subset(telemark, select = -c(subscribe))))

ggcorr(telemark, label = T)
## Warning in ggcorr(telemark, label = T): data in column(s) 'job', 'marital',
## 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome',
## 'subscribe' are not numeric and were ignored

numericCols <- unlist(lapply(telemark, is.numeric))
show_plot(inspect_num(telemark[,numericCols]))

prop.table(table(telemark$subscribe))
## 
##        no       yes 
## 0.8830152 0.1169848
set.seed(1)
split <- initial_split(data = telemark, prop = 0.8, strata = subscribe)
telemark_train <- training(split)
telemark_test <- testing(split)
prop.table(table(telemark_train$subscribe))
## 
##        no       yes 
## 0.8832426 0.1167574
# telemark_train_upsample <- upSample(x = telemark_train[, -17], y = telemark_train$subscribe, yname = "subscribe")
telemark_train_upsample <- SMOTE(subscribe ~ ., as.data.frame(telemark_train), perc.over = 100, perc.under = 200)
prop.table(table(telemark_train_upsample$subscribe))
## 
##  no yes 
## 0.5 0.5
model_naive <- naiveBayes(subscribe ~ ., data = telemark_train_upsample)
naive_prediction <- predict(model_naive, telemark_test)
naive_prediction_raw <- as.data.frame(predict(model_naive, telemark_test, type = "raw"))

naive_prediction_raw <- naive_prediction_raw %>%
  mutate(no = round(no,4),
         yes = round(yes,4))
naive_matrix <- confusionMatrix(naive_prediction, telemark_test$subscribe)
table <- as.table(naive_matrix)
table <- as.data.frame(table)

table %>% ggplot(aes(x = Prediction, y = Reference, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), fontface = "bold", color = "white") +
  theme_minimal() +
  theme(legend.position = "none")

matrix_1 <- as.data.frame(t(as.matrix(naive_matrix, what = "overall")))
matrix_2 <- as.data.frame(t(as.matrix(naive_matrix, what = "classes")))
matrix <- cbind(matrix_1, matrix_2)
matrix %>% select(Accuracy, Sensitivity, Specificity, "Pos Pred Value") %>% 
  t()
##                     [,1]
## Accuracy       0.7614466
## Sensitivity    0.7550150
## Specificity    0.8095685
## Pos Pred Value 0.9673896
# ROC
naive_roc <- data.frame(prediction = naive_prediction_raw[,2],
                        trueclass = as.numeric(telemark_test$subscribe=="yes"))
head(naive_roc)
##   prediction trueclass
## 1     0.1772         0
## 2     0.4306         0
## 3     0.1510         0
## 4     0.1432         0
## 5     0.0785         0
## 6     0.0646         0
naive_roc <- prediction(naive_roc$prediction, naive_roc$trueclass) 

# ROC curve
plot(performance(naive_roc, "tpr", "fpr"),
     main = "ROC")
abline(a = 0, b = 1)

# AUC
auc_ROCR_n <- performance(naive_roc, measure = "auc")
auc_ROCR_n <- auc_ROCR_n@y.values[[1]]
auc_ROCR_n
## [1] 0.8456519
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# model tuning - metrics function
metrics <- function(cutoff, prob, ref, postarget, negtarget) 
{
  predict <- as.factor(ifelse(prob >= cutoff, postarget, negtarget))
  conf <- caret::confusionMatrix(predict , ref, positive = postarget)
  acc <- conf$overall[1]
  rec <- conf$byClass[1]
  prec <- conf$byClass[3]
  spec <- conf$byClass[2]
  mat <- t(as.matrix(c(rec , acc , prec, spec))) 
  colnames(mat) <- c("recall", "accuracy", "precicion", "specificity")
  return(mat)
}
co <- seq(0.01,0.99,length=100)
result <- matrix(0,100,4)

# apply function metrics
for(i in 1:100){
  result[i,] = metrics(cutoff = co[i], 
                     prob = naive_prediction_raw$yes, 
                     ref = as.factor(ifelse(telemark_test$subscribe == "yes", 1, 0)), 
                     postarget = "1", 
                     negtarget = "0")
}

# visualize
ggplotly(tibble("Recall" = result[,1],
           "Accuracy" = result[,2],
           "Precision" = result[,3],
           "Specificity" = result[,4],
                   "Cutoff" = co) %>% 
  gather(key = "Metrics", value = "value", 1:4) %>% 
  ggplot(aes(x = Cutoff, y = value, col = Metrics)) +
  geom_line(lwd = 1.5) +
  scale_color_manual(values = c("darkred","darkgreen","orange", "blue")) +
  scale_y_continuous(breaks = seq(0,1,0.1), limits = c(0,1)) +
  scale_x_continuous(breaks = seq(0,1,0.1)) +
  labs(title = "Tradeoff Model Perfomance") +
  theme_minimal() +
  theme(legend.position = "top",
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank()))